This dataset contains credit card transactions made by European cardholders in the year 2023. It comprises over 550,000 records, and the data has been anonymized to protect the cardholders' identities. The primary objective of this dataset is to facilitate the development of fraud detection algorithms and models to identify potentially fraudulent transactions. (Taken from Kaggle.com)
- Utilized four classification models (Logistic Regression, Decision Tree, Random Forest, and XGBoost)
- Tested models for accuracy, precision, recall, F1-score, AUC-ROC, and training time
- Performed the same tests on data with reduced dimensionality
- Found little change in metric values but a large decrease in training time after dimensionality reduction
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from xgboost import plot_tree as xgb_plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
# Load the 2023 credit card transaction data
df = pd.read_csv("creditcard_2023.csv")
df
df.head()  # Look at first few rows
print(f"Number of Rows: {len(df)}")  # Find number of rows
# FIX: use df.shape[1]; len(df.iloc[0]) materializes an entire row just to count columns
print(f"Number of Columns: {df.shape[1]}")
df.info()  # See the data types of each column
# Check for missing values
df.isnull().sum(axis=0)  # There seem to be no missing values
# Descriptive Statistics
df.describe()
# Drop the ID since they are all unique and do not contribute to the Class
df = df.drop("id", axis=1)
# Separating data by class (Class 0 = legitimate, Class 1 = fraudulent)
legit_data = df[df["Class"] == 0]
fraud_data = df[df["Class"] == 1]
# Visualize each feature: class-split histograms plus a per-class boxplot
sns.set(style="whitegrid")
feature_columns = [c for c in df.columns if c != 'Class']  # skip the target
for feature in feature_columns:
    fig, (dist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 7))
    # Overlay the legitimate and fraudulent distributions on the left axis
    sns.histplot(legit_data[feature], kde=True, color='blue', label='Legitimate', ax=dist_ax)
    sns.histplot(fraud_data[feature], kde=True, color='red', label='Fraudulent', ax=dist_ax)
    # Boxplot grouped by class on the right axis
    sns.boxplot(x='Class', y=feature, data=df, palette='viridis', ax=box_ax)
    dist_ax.set_title(f'Distribution of {feature} by Target')
    dist_ax.set_xlabel(feature)
    dist_ax.set_ylabel('Frequency')
    dist_ax.legend()
    box_ax.set_title(f"Boxplot of {feature} by Target")
    box_ax.set_xlabel("Target")
    box_ax.set_ylabel(feature)
    plt.tight_layout()
    plt.show()
From the EDA we gain a few insights
# Plotting Correlation Matrix against the target class
# FIX: the original concat of df-without-Class and df['Class'] just rebuilt df
# with Class as the last column; column order does not affect corr(), so use df directly.
correlation_matrix = df.corr()
# Set up the matplotlib figure
plt.figure(figsize=(12, 10))
# Heatmap of each feature's correlation with the Class column only
sns.heatmap(correlation_matrix[['Class']], annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap with Target (Legit: 0, Fraud: 1)')
plt.show()
The correlation heatmap of each feature against the target class shows that a few of the features are correlated with the target class. These include V3, V4, V11, V12, and V14. This can be useful for dimensionality reduction: utilizing only these features may achieve results comparable to using all the features.
# Plot the correlations between the predictor features themselves
features_only = df.drop('Class', axis=1)
feature_correlation_matrix = features_only.corr()
# A large canvas so all 28+ feature labels stay readable
plt.figure(figsize=(16, 14))
sns.heatmap(feature_correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f", linewidths=.5)
plt.title('Cross-Features Correlation Heatmap')
plt.show()
When looking at the cross-feature correlation heatmap, the main insight is that features V16, V17, and V18 appear to be correlated with one another, as we see large positive correlations among these three features. This is a potential avenue for dimensionality reduction, as it may allow us to remove some columns whose correlation with other columns makes them redundant.
# Find class counts to check if data is balanced
df["Class"].value_counts()  # counts per class; roughly equal counts mean accuracy is a fair metric
The two classes have roughly equal counts, so the data is balanced and accuracy is a meaningful metric.
X = df.drop("Class", axis=1)
# FIX: use a 1-D Series for the target. df[["Class"]] is a single-column
# DataFrame, which makes sklearn estimators emit column-vector shape warnings
# (previously hidden by warnings.filterwarnings("ignore")).
y = df["Class"]
cols = X.columns
scaler = RobustScaler()  # Using RobustScaler to standardize the data while taking into account the large number of outliers
X[cols] = scaler.fit_transform(X[cols])
# Compare every feature's distribution before and after Robust scaling
sns.set(style="whitegrid")
for feature in cols:
    if feature == 'Class':  # Exclude the target column
        continue
    plt.figure(figsize=(12, 6))
    # Unscaled values come from the original frame, scaled values from X
    sns.histplot(x=feature, data=df, kde=True, color='blue', label='Unscaled')
    sns.histplot(x=feature, data=X, kde=True, color='red', label='Scaled')
    plt.title(f'Distribution of {feature} Before and After Robust Scaling')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()
# ----- Logistic Regression -----
log = LogisticRegression()
# Partition the data into an 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
log.fit(X_train, y_train)
# Predicted probability of the positive (fraud) class on the held-out set
y_prob = log.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_prob)
print(f'AUC-ROC: {auc_roc:.4f}')
# ROC curve plotted against the random-guess diagonal
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {auc_roc:.2f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
# Confusion matrix of the hard class predictions
y_pred = log.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix,
                              columns=['Predicted 0', 'Predicted 1'],
                              index=['Actual 0', 'Actual 1'])
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
# ----- Decision Tree -----
tree = DecisionTreeClassifier()
# Partition the data into an 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
tree.fit(X_train, y_train)
# Predicted probability of the positive (fraud) class on the held-out set
y_prob = tree.predict_proba(X_test)[:, 1]
# Calculate AUC-ROC
auc_roc = roc_auc_score(y_test, y_prob)
print(f'AUC-ROC: {auc_roc:.4f}')
# ROC curve plotted against the random-guess diagonal
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {auc_roc:.2f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
# Confusion matrix of the hard class predictions
y_pred = tree.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix,
                              columns=['Predicted 0', 'Predicted 1'],
                              index=['Actual 0', 'Actual 1'])
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
# Plot Decision Tree
fig = plt.figure(figsize=(14, 6))
# FIX: class_names must be one label per class; the original string 'Class'
# was iterated character-by-character ('C', 'l', ...) by plot_tree.
ax = plot_tree(tree, feature_names=X.columns.to_list(),
               class_names=['Legit', 'Fraud'], filled=True)
plt.title('Decision Tree')
plt.show()
# ----- Random Forest -----
forest = RandomForestClassifier()
# Partition the data into an 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
forest.fit(X_train, y_train)
# Predicted probability of the positive (fraud) class on the held-out set
y_prob = forest.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_prob)
print(f'AUC-ROC: {auc_roc:.4f}')
# ROC curve plotted against the random-guess diagonal
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {auc_roc:.2f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
# Confusion matrix of the hard class predictions
y_pred = forest.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix,
                              columns=['Predicted 0', 'Predicted 1'],
                              index=['Actual 0', 'Actual 1'])
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
# Rank features by the forest's impurity-based importances
importances = forest.feature_importances_
feature_importances = (pd.DataFrame({'feature': X.columns,
                                     'importance': np.round(importances, 3)})
                       .sort_values('importance', ascending=False))
fig = plt.figure(figsize=(14, 6))
ax = sns.barplot(x='importance', y='feature', data=feature_importances)
ax.bar_label(ax.containers[0])  # annotate each bar with its importance value
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Random Forest Feature Importance')
plt.show()
# ----- XGBoost -----
xgb = XGBClassifier()
# Partition the data into an 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
xgb.fit(X_train, y_train)
# Predicted probability of the positive (fraud) class on the held-out set
y_prob = xgb.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_prob)
print(f'AUC-ROC: {auc_roc:.4f}')
# ROC curve plotted against the random-guess diagonal
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {auc_roc:.2f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
# Confusion matrix of the hard class predictions
y_pred = xgb.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix,
                              columns=['Predicted 0', 'Predicted 1'],
                              index=['Actual 0', 'Actual 1'])
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
# Rank features by XGBoost's gain-based importances
importances = xgb.feature_importances_
feature_importances = (pd.DataFrame({'feature': X.columns,
                                     'importance': np.round(importances, 3)})
                       .sort_values('importance', ascending=False))
fig = plt.figure(figsize=(14, 6))
ax = sns.barplot(x='importance', y='feature', data=feature_importances)
ax.bar_label(ax.containers[0])  # annotate each bar with its importance value
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('XGBoost Feature Importance')
plt.show()
# Render the first boosted tree left-to-right
fig, ax = plt.subplots(figsize=(30, 30))
xgb_plot_tree(xgb, num_trees=0, rankdir='LR', ax=ax)
plt.show()
# Split the data into training and testing sets (one shared split so every
# model is compared on the same data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'XGBoost': XGBClassifier()
}
# Train and evaluate each model
results = {}
for name, model in models.items():
    start_time = time.time()
    model.fit(X_train, y_train)
    # FIX: stop the clock right after fit — the original measured "training time"
    # after all predictions, inflating it by five inference passes.
    train_time = time.time() - start_time
    # FIX: predict once and reuse; the original called model.predict(X_test)
    # four times per model (once per metric).
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # probability of the fraud class
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc_score(y_test, y_prob),
        "Training Time (sec)": train_time
    }
# Display the results (rows = models, columns = metrics)
results_df = pd.DataFrame(results).T
results_df
Upon utilizing all the features in the data to make predictions we see some useful comparisons
# Overlay the ROC curve of every fitted model on one axis
plt.figure(figsize=(10, 6))
for name, model in models.items():
    # Recompute test-set fraud probabilities for this model
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {results[name]["AUC-ROC"]:.2f})')
# Random-guess baseline
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
plt.title('ROC Curves for Different Models')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
Now I will do the same set of predictive models as before, however I will only use the top 5 features described by the initial Random Forest as being the most significant (V4, V10, V12, V14, V17)
# Reducing the data to the top 5 features as seen from the Random Forest
# FIX: select from the scaled matrix X, not the raw df — the full-feature models
# were trained on RobustScaler output, so this comparison must be too.
top_features = ["V4", "V10", "V12", "V14", "V17"]
X_reduced = X[top_features]
# Split the data into training and testing sets
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
# Initialize models
models_reduced = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'XGBoost': XGBClassifier()
}
# Train and evaluate each model
results_reduced = {}
for name, model in models_reduced.items():
    start_time = time.time()
    model.fit(X_train_reduced, y_train)
    # FIX: stop the clock right after fit so prediction time is not counted
    train_time = time.time() - start_time
    # FIX: predict once and reuse instead of four separate predict calls
    y_pred = model.predict(X_test_reduced)
    y_prob = model.predict_proba(X_test_reduced)[:, 1]  # fraud-class probability
    results_reduced[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc_score(y_test, y_prob),
        "Training Time (sec)": train_time
    }
# Display the results
reduced_results_df = pd.DataFrame(results_reduced).T
Lastly, I will use the same predictive models on data that has been reduced via PCA and preserving 95% of the variance.
# PCA keeping enough components to retain 95% of the variance
pca = PCA(n_components=0.95)  # float n_components => variance fraction to keep
X_pca = pca.fit_transform(X)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
# Plot explained variance
plt.plot(cumulative_explained_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs. Number of Components (RobustScaler)')
plt.show()
# FIX: PCA(n_components=0.95) already selected the component count; read it
# from the fitted estimator instead of recomputing via argmax.
n_components_to_keep = pca.n_components_
print(f"Number of components to keep (RobustScaler): {n_components_to_keep}")
# FIX: X_pca already has exactly n_components_to_keep columns, so the second
# PCA fit in the original was redundant work on 550k rows.
X_final_pca = X_pca
# Split the data into training and testing sets
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_final_pca, y, test_size=0.2, random_state=42)
# Initialize models
models_pca = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'XGBoost': XGBClassifier()
}
# Train and evaluate each model
results_pca = {}
for name, model in models_pca.items():
    start_time = time.time()
    model.fit(X_train_pca, y_train)
    # FIX: stop the clock right after fit so prediction time is not counted
    train_time = time.time() - start_time
    # FIX: predict once and reuse instead of four separate predict calls
    y_pred = model.predict(X_test_pca)
    y_prob = model.predict_proba(X_test_pca)[:, 1]  # fraud-class probability
    results_pca[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc_score(y_test, y_prob),
        "Training Time (sec)": train_time
    }
# Display the results
results_pca_df = pd.DataFrame(results_pca).T
results_df  # Results after using classification models with all features
reduced_results_df  # Results after using classification models with only the top 5 important features as determined by the initial random forest
results_pca_df  # Results after using classification models on PCA reduced data by 16 principal components to capture 95% of the variance
Comparing the model performances between data of different dimensions we gain more valuable insights
We have successfully reduced the training time of the models by utilizing dimensionality reduction. When using only the top 5 features, we achieved performance nearly identical to the original models that used all the features, yet the training time was cut by more than half, especially for the Random Forest. Therefore, given how little the performance changes, the features V4, V10, V12, V14, and V17 carry most of the predictive signal, and using only those features gives similar results to using all the features at a fraction of the training cost.